{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "## 导入包"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "import shutil\n",
    "from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer\n",
    "from transformers import DataCollatorWithPadding\n",
    "from transformers import AutoTokenizer\n",
    "from datasets import load_dataset\n",
    "import evaluate\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import os\n",
    "\n",
    "metric = evaluate.load(\"accuracy\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "## 设置\n",
    "1. `MODEL_NAME_OR_PATH`:大模型的名称，这里使用的是`\"hfl/chinese-roberta-wwm-ext\"`\n",
    "2. `NUM_LABELS`:文本分类的类别数量，这个是按需而定，我这里分类是只有两类，因此就设置为2\n",
    "3. `MAX_LENGTH`:每一条文本的长度，这里设置为64。一般来说长度64就已经覆盖大部分场景了，128也是可以。但是128对显存要求更高一点。"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# \"bert-base-chinese\"  # \"distilbert-base-uncased\"\n",
    "MODEL_NAME_OR_PATH = \"hfl/chinese-roberta-wwm-ext\"\n",
    "NUM_LABELS = 2\n",
    "MAX_LENGTH = 64\n",
    "\n",
    "model = AutoModelForSequenceClassification.from_pretrained(\n",
    "    MODEL_NAME_OR_PATH, num_labels=NUM_LABELS)\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_OR_PATH)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "text_dataset = load_dataset('csv', data_files={\n",
    "    'train': ['data_all/data/train_data.csv'],\n",
    "    'test': ['data_all/data/test_data.csv']})\n",
    "\n",
    "\n",
    "def preprocess_function(examples):\n",
    "    return tokenizer(examples[\"text\"], truncation=True, max_length=MAX_LENGTH)\n",
    "\n",
    "# 在实际工程中，会先使用`Tokenizer`把所有的文本转换成`input_ids`,`token_type_ids`,`attention_mask`，然后在训练的时候，这步就不再做了，目的是减少训练过程中cpu处理数据的时间，不给显卡休息时间。\n",
    "tokenized_text = text_dataset.map(preprocess_function, batched=True)\n",
    "\n",
    "data_collator = DataCollatorWithPadding(\n",
    "    tokenizer=tokenizer, max_length=MAX_LENGTH)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "## 模型训练设置\n",
    "1. `LOGGING_DIR`: 就是一个储存日志的文件夹。\n",
    "2. `MODEL_DIR`: 在模型训练的时候，会把每一阶段的模型都保存在文件夹下。"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "LOGGING_DIR = \"logging_dir\"\n",
    "MODEL_DIR = \"model_result\"\n",
    "shutil.rmtree(LOGGING_DIR, ignore_errors=True)\n",
    "\n",
    "training_args = TrainingArguments(\n",
    "    output_dir=MODEL_DIR,\n",
    "    overwrite_output_dir=True,\n",
    "    logging_dir=LOGGING_DIR,\n",
    "    learning_rate=2e-5,\n",
    "    per_device_train_batch_size=32,  # 每一次batch，训练的数据的数量，如果显存高，可以32起步，如果一般，那可能就是个位数，比如2，4，8，16等。\n",
    "    per_device_eval_batch_size=32,  # 在评估的时候，batch的大小，看显存大小了。\n",
    "    do_eval=True,\n",
    "    evaluation_strategy=\"steps\",\n",
    "    eval_accumulation_steps=50,\n",
    "    eval_steps=50,\n",
    "    logging_steps=50,\n",
    "    save_steps=100,\n",
    "    num_train_epochs=4,  #训练多少轮\n",
    "    weight_decay=0.01,\n",
    "    save_total_limit=3,  # 模型每`eval_steps`步，就会保存一下模型，只会保存最新的3个模型，\n",
    "    jit_mode_eval=True,\n",
    "    fp16=True,\n",
    "    fp16_opt_level='O3',\n",
    "    load_best_model_at_end=True,  # 最后，加载效果最好的模型\n",
    "\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_metrics(eval_pred):\n",
    "    logits, labels = eval_pred\n",
    "    predictions = np.argmax(logits, axis=-1)\n",
    "    return metric.compute(predictions=predictions, references=labels)\n",
    "\n",
    "\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=tokenized_text[\"train\"],\n",
    "    eval_dataset=tokenized_text[\"test\"],\n",
    "    tokenizer=tokenizer,\n",
    "    data_collator=data_collator,\n",
    "    compute_metrics=compute_metrics\n",
    ")\n",
    "\n",
    "trainer.train()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mynet2",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "110bc624a448454d574a0cd6cc76359fd86f75739e493913b3d71c2e04f2ffdb"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
